!pip install xgboost
!pip install wordcloud
!pip install plotly
import numpy as np # Multi-dimensional array object
import pandas as pd # Data Manipulation
import seaborn as sns # Data Visualization
import matplotlib.pyplot as plt # Data Visualization
import plotly.express as px # Interactive Data Visualization
from jupyterthemes import jtplot # Jupyter Notebook Theme
jtplot.style(theme = 'monokai', context = 'notebook', ticks = True, grid = False)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # Offline version of the Plotly modules.
# Read the CSV file
# Load the top 10 instances
# Load the bottom 10 instances
# Display the feature columns
# Check the shape of the dataframe
# Check if any missing values are present in the dataframe
# Obtain the summary of the dataframe
# MSRP (like Invoice) is stored as text containing a "$" sign and "," separators;
# strip both so the column can be cast to int.
# regex=False forces literal matching: "$" is a regex end-of-string anchor, and the
# default value of `regex` differs between pandas releases.
car_df["MSRP"] = (
    car_df["MSRP"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)
car_df["MSRP"]
MINI CHALLENGE #1:
# Let's view the updated MSRP and Invoice Columns
# Display the updated summary of the dataframe
MINI CHALLENGE #2:
# scatterplots for joint relationships and histograms for univariate distributions
# Let's view various makes of the cars
def _show_histogram(column, axis_label, title, **px_kwargs):
    """Draw a count histogram of one `car_df` column, display it, and return the figure.

    `axis_label` replaces the column name on the axis; any extra keyword
    arguments are forwarded unchanged to `px.histogram`.
    """
    figure = px.histogram(
        car_df,
        x=column,
        labels={column: axis_label},
        title=title,
        **px_kwargs,
    )
    figure.show()
    return figure


# Row count per manufacturer
fig = _show_histogram(
    "Make", "Manufacturer", "MAKE OF THE CAR",
    color_discrete_sequence=["maroon"],
)

# Distinct car types, then row count per type
car_df.Type.unique()
fig = _show_histogram(
    "Type", "Type", "TYPE OF THE CAR",
    color_discrete_sequence=["blue"],
)

# Distinct origins, then row count per origin
car_df.Origin.unique()
fig = _show_histogram(
    "Origin", "Origin", "LOCATION OF THE CAR SALES",
    color_discrete_sequence=["brown"],
)

# Distinct drivetrains, then row count per drivetrain
car_df.DriveTrain.unique()
fig = _show_histogram(
    "DriveTrain", "Drivetrain", "DRIVETRAIN OF THE CAR",
    color_discrete_sequence=["BLACK"],
)

# Row count per manufacturer again, this time coloured by origin
fig = _show_histogram(
    "Make", "Manufacturer", "MAKE OF THE CAR Vs LOCATION",
    color="Origin",
)
MINI CHALLENGE #3:
# Let's view the model of all used cars using WordCloud generator
from wordcloud import WordCloud, STOPWORDS
# Bare expression: a notebook renders the frame only when this ends a cell;
# anywhere else it is a no-op.
car_df

# Raw model names as a NumPy array (name kept for any later cells that use it).
text = car_df.Model.values
stopwords = set(STOPWORDS)

# Feed the word cloud the names themselves, separated by spaces. Passing
# str(text) would tokenise the array's printed repr instead: brackets, quote
# marks, and a "..." elision once the array is long enough to be summarised.
# Missing values are skipped so they do not show up as the word "nan".
model_names = " ".join(str(name) for name in text if pd.notna(name))

wc = WordCloud(
    background_color="black",
    max_words=2000,
    max_font_size=100,
    random_state=3,  # fixed seed so the layout is reproducible
    stopwords=stopwords,
    contour_width=3,
).generate(model_names)

fig = plt.figure(figsize=(25, 15))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
# Obtain the correlation matrix
MINI CHALLENGE #4:
car_df.head()
# Perform One-Hot Encoding for "Make", "Model", "Type", "Origin", and "DriveTrain"
# Invoice feature does not contribute to car price prediction
# NOTE(review): the code for the two steps above is not present here; `df_data`
# is presumably their result -- confirm it is defined before this point.
df_data.shape

# Inputs X are every column except MSRP; the target y is MSRP itself.
# Both are converted to NumPy arrays.
X = np.array(df_data.drop(columns="MSRP"))
y = np.array(df_data["MSRP"])

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. No random_state is given, so the
# split (and every score computed from it) varies from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
MINI CHALLENGE #5:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
from math import sqrt
# Score the linear model on the held-out split.
# NOTE(review): LinearRegression_model is not defined in the visible code; if it
# is a fitted scikit-learn regressor, .score() returns R^2 (coefficient of
# determination), not a classification accuracy, despite the variable name.
accuracy_LinearRegression = LinearRegression_model.score(X_test, y_test)
accuracy_LinearRegression
# Photo Credits:
# https://creazilla.com/nodes/22202-giraffe-clipart
# https://pixy.org/4569488/
# https://pixabay.com/illustrations/monkey-animal-gorilla-zoo-nature-4187960/
# https://creazilla.com/nodes/15581-running-tiger-clipart
from sklearn.tree import DecisionTreeRegressor
# Score the decision tree on the held-out split.
# NOTE(review): DecisionTree_model is not defined in the visible code; if it is
# a fitted scikit-learn regressor, .score() returns R^2 (coefficient of
# determination), not a classification accuracy, despite the variable name.
accuracy_DecisionTree = DecisionTree_model.score(X_test, y_test)
accuracy_DecisionTree
from sklearn.ensemble import RandomForestRegressor
# Score the random forest on the held-out split.
# NOTE(review): RandomForest_model is not defined in the visible code; if it is
# a fitted scikit-learn regressor, .score() returns R^2 (coefficient of
# determination), not a classification accuracy, despite the variable name.
accuracy_RandomForest= RandomForest_model.score(X_test, y_test)
accuracy_RandomForest
from xgboost import XGBRegressor
# Score the boosted model on the held-out split.
# NOTE(review): `model` is not defined in the visible code -- presumably a
# fitted XGBRegressor; if so, .score() returns R^2 (coefficient of
# determination), not a classification accuracy, despite the variable name.
accuracy_XGBoost = model.score(X_test, y_test)
accuracy_XGBoost
MINI CHALLENGE #6:
# Evaluate the linear model on the held-out split: predicted-vs-actual plot
# followed by the usual regression error metrics.
y_predict_linear = LinearRegression_model.predict(X_test)

# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# vectors. regplot returns the Axes it drew on.
fig = sns.regplot(x=y_predict_linear, y=y_test, color='red', marker="^")
fig.set(title = "Linear Regression Model", xlabel = "Predicted Price of the used cars ($)", ylabel = "Actual Price of the used cars ($)")

# Compute MSE once and derive RMSE from it (rounded to 3 decimals for display).
MSE = mean_squared_error(y_test, y_predict_linear)
RMSE = round(float(np.sqrt(MSE)), 3)
MAE = mean_absolute_error(y_test, y_predict_linear)
r2 = r2_score(y_test, y_predict_linear)
print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2)
# Evaluate the random forest on the held-out split: predicted-vs-actual plot
# followed by the usual regression error metrics.
y_predict_RandomForest = RandomForest_model.predict(X_test)

# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# vectors. regplot returns the Axes it drew on.
fig = sns.regplot(x=y_predict_RandomForest, y=y_test, color='blue', marker="s")
fig.set(title = "Random Forest Regression Model", xlabel = "Predicted Price of the used cars ($)", ylabel= "Actual Price of the used cars ($)")

# Compute MSE once and derive RMSE from it (rounded to 3 decimals for display).
MSE = mean_squared_error(y_test, y_predict_RandomForest)
RMSE = round(float(np.sqrt(MSE)), 3)
MAE = mean_absolute_error(y_test, y_predict_RandomForest)
r2 = r2_score(y_test, y_predict_RandomForest)
print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2)
# Evaluate the boosted model on the held-out split: predicted-vs-actual plot
# followed by the usual regression error metrics.
y_predict_XGBoost = model.predict(X_test)

# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# vectors. regplot returns the Axes it drew on.
fig = sns.regplot(x=y_predict_XGBoost, y=y_test, color='green', marker="D")
fig.set(title = "XGBoost Model", xlabel = "Predicted Price of the used cars ($)", ylabel = "Actual Price of the used cars ($)")

# Compute MSE once and derive RMSE from it (rounded to 3 decimals for display).
MSE = mean_squared_error(y_test, y_predict_XGBoost)
RMSE = round(float(np.sqrt(MSE)), 3)
MAE = mean_absolute_error(y_test, y_predict_XGBoost)
r2 = r2_score(y_test, y_predict_XGBoost)
print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2)
From the above results, it is clear that the XGBoost model scores about 94% (the R² value reported by `.score`), outperforming both the Linear Regression and Random Forest Regression models.
MINI CHALLENGE #1 SOLUTION:
# Invoice is stored as text containing a "$" sign and "," separators; strip
# both so the column can be cast to int.
# regex=False forces literal matching: "$" is a regex end-of-string anchor, and the
# default value of `regex` differs between pandas releases.
car_df["Invoice"] = (
    car_df["Invoice"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)
MINI CHALLENGE #2 SOLUTION:
# Highest and lowest MSRP in the data, one value per line
print(car_df["MSRP"].max(), car_df["MSRP"].min(), sep="\n")
# Summary statistics for the dataframe's columns
car_df.describe()
MINI CHALLENGE #3 SOLUTION:
# Row count per manufacturer, with bars coloured by car type
fig = px.histogram(
    data_frame=car_df,
    x="Make",
    title="MAKE AND TYPE OF THE CAR",
    labels={"Make": "Manufacturer"},
    color="Type",
    opacity=1,
)
fig.show()
-Porsche
-Honda and Toyota
MINI CHALLENGE #4 SOLUTION:
# Positive correlation between engine size and number of cylinders
# Positive correlation between horsepower and number of cylinders
# highest positive correlation with MSRP is = horsepower
MINI CHALLENGE #5 SOLUTION:
# Inspect the dimensions of the training and test feature arrays.
# Bare expressions: a notebook shows a value only when it ends a cell.
X_train.shape
X_test.shape
MINI CHALLENGE #6 SOLUTION:
# XG-boost